library(rtracklayer)
library(tidyverse)
library(ggforce)
library(GenomicRanges)
library(reshape2)
library(RColorBrewer)
# Load data
ATAC_granges <- import("../Data/ATAC_idr.optimal_peak.narrowPeak.gz", format="gz")
CAGE_granges <- import("../Data/ctss_files/hg38.CAGE_AF7N_Pool_81_A2.filt.ctss.bed.gz", format="gz")
genome(ATAC_granges) <- "hg38"
genome(CAGE_granges) <- "hg38"
ATAC_granges
## GRanges object with 258265 ranges and 6 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr15 89087912-89088828 * | <NA> 1000
## [2] chr1 206202745-206204098 * | <NA> 1000
## [3] chr11 6234200-6235551 * | <NA> 1000
## [4] chr11 3797056-3798236 * | <NA> 1000
## [5] chr8 25458212-25459500 * | <NA> 1000
## ... ... ... ... . ... ...
## [258261] chr10 106089928-106090175 * | <NA> 200
## [258262] chr10 104639508-104639928 * | <NA> 200
## [258263] chr10 104638719-104639064 * | <NA> 200
## [258264] chr10 103131426-103131736 * | <NA> 200
## [258265] chr10 101874851-101875892 * | <NA> 357
## signalValue pValue qValue peak
## <numeric> <numeric> <numeric> <integer>
## [1] 22.9179 3542.55 3536.38 348
## [2] 15.9319 3417.05 3410.94 464
## [3] 17.5065 3325.87 3319.80 718
## [4] 20.1593 3283.13 3277.08 639
## [5] 18.0436 3234.07 3228.04 389
## ... ... ... ... ...
## [258261] 2.09155 3.82812 2.18792 93
## [258262] 2.09155 3.82812 2.18792 390
## [258263] 2.09155 3.82812 2.18792 237
## [258264] 2.09155 3.82812 2.18792 207
## [258265] 2.09155 3.82812 2.18792 177
## -------
## seqinfo: 24 sequences from hg38 genome; no seqlengths
CAGE_granges
## GRanges object with 1486902 ranges and 2 metadata columns:
## seqnames ranges strand | name score
## <Rle> <IRanges> <Rle> | <character> <numeric>
## [1] chr1 17496 - | chr1:17495-17496,- 1
## [2] chr1 184489 - | chr1:184488-184489,- 1
## [3] chr1 611298 + | chr1:611297-611298,+ 1
## [4] chr1 629080 + | chr1:629079-629080,+ 1
## [5] chr1 629081 + | chr1:629080-629081,+ 1
## ... ... ... ... . ... ...
## [1486898] chrY 56844175 + | chrY:56844174-56844175,+ 1
## [1486899] chrY 56844226 - | chrY:56844225-56844226,- 2
## [1486900] chrY 56849211 + | chrY:56849210-56849211,+ 1
## [1486901] chrY 56853769 - | chrY:56853768-56853769,- 2
## [1486902] chrY 56869591 - | chrY:56869590-56869591,- 1
## -------
## seqinfo: 24 sequences from hg38 genome; no seqlengths
# Extend ATAC peaks 250 bp in both directions
ATAC_granges_peaks <- GRanges(seqnames = seqnames(ATAC_granges),
ranges = IRanges(start = start(ATAC_granges) + ATAC_granges$peak, width = 1),
strand = strand(ATAC_granges))
genome(ATAC_granges_peaks) <- "hg38"
start(ATAC_granges_peaks) <- start(ATAC_granges_peaks) - 250
end(ATAC_granges_peaks) <- end(ATAC_granges_peaks) + 250
ranges(ATAC_granges_peaks)
## IRanges object with 258265 ranges and 0 metadata columns:
## start end width
## <integer> <integer> <integer>
## [1] 89088010 89088510 501
## [2] 206202959 206203459 501
## [3] 6234668 6235168 501
## [4] 3797445 3797945 501
## [5] 25458351 25458851 501
## ... ... ... ...
## [258261] 106089771 106090271 501
## [258262] 104639648 104640148 501
## [258263] 104638706 104639206 501
## [258264] 103131383 103131883 501
## [258265] 101874778 101875278 501
ATAC_granges_peaks
## GRanges object with 258265 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr15 89088010-89088510 *
## [2] chr1 206202959-206203459 *
## [3] chr11 6234668-6235168 *
## [4] chr11 3797445-3797945 *
## [5] chr8 25458351-25458851 *
## ... ... ... ...
## [258261] chr10 106089771-106090271 *
## [258262] chr10 104639648-104640148 *
## [258263] chr10 104638706-104639206 *
## [258264] chr10 103131383-103131883 *
## [258265] chr10 101874778-101875278 *
## -------
## seqinfo: 24 sequences from hg38 genome; no seqlengths
# Remove overlapping regions
ATAC_peaks_filtered <- rownames_to_column(as.data.frame(ATAC_granges_peaks)) %>%
as_tibble() %>% mutate(overlaps = countOverlaps(ATAC_granges_peaks)) %>%
filter(overlaps==1)
ATAC_peaks_filtered
ATAC_granges_peaks_filtered <- ATAC_granges_peaks[as.numeric(ATAC_peaks_filtered$rowname)]
ATAC_granges_peaks_filtered
## GRanges object with 81919 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr12 55973648-55974148 *
## [2] chr3 50235720-50236220 *
## [3] chr17 15699328-15699828 *
## [4] chr15 90716993-90717493 *
## [5] chr14 77616665-77617165 *
## ... ... ... ...
## [81915] chr10 112475607-112476107 *
## [81916] chr10 109443822-109444322 *
## [81917] chr10 108630531-108631031 *
## [81918] chr10 106089771-106090271 *
## [81919] chr10 103131383-103131883 *
## -------
## seqinfo: 24 sequences from hg38 genome; no seqlengths
ranges(ATAC_granges_peaks_filtered)
## IRanges object with 81919 ranges and 0 metadata columns:
## start end width
## <integer> <integer> <integer>
## [1] 55973648 55974148 501
## [2] 50235720 50236220 501
## [3] 15699328 15699828 501
## [4] 90716993 90717493 501
## [5] 77616665 77617165 501
## ... ... ... ...
## [81915] 112475607 112476107 501
## [81916] 109443822 109444322 501
## [81917] 108630531 108631031 501
## [81918] 106089771 106090271 501
## [81919] 103131383 103131883 501
length(ATAC_granges)
## [1] 258265
length(ATAC_granges_peaks_filtered)
## [1] 81919
# Get number of windows for each chr
table(seqnames(ATAC_granges_peaks_filtered))
##
## chr15 chr1 chr11 chr8 chr5 chr10 chr19 chr13 chr4 chr9 chr3 chr12 chr14
## 3068 7478 3981 3766 4583 3973 2190 2186 4343 3269 5942 4056 2649
## chr6 chr17 chr2 chr7 chr16 chrX chr18 chr22 chr20 chr21 chrY
## 4943 2945 6981 4212 2533 2612 1946 1291 2139 831 2
# Alternatively I can collpase overlapping ranges
GenomicRanges::reduce(ATAC_granges_peaks)
## GRanges object with 144900 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr15 19988001-19988501 *
## [2] chr15 20021287-20021787 *
## [3] chr15 20191686-20192186 *
## [4] chr15 20192954-20193710 *
## [5] chr15 20204491-20204991 *
## ... ... ... ...
## [144896] chr21 46637923-46638823 *
## [144897] chr21 46661197-46661697 *
## [144898] chr21 46667073-46668259 *
## [144899] chrY 4991691-4992191 *
## [144900] chrY 15355319-15355819 *
## -------
## seqinfo: 24 sequences from hg38 genome; no seqlengths
# Report time execution
report_time_execution <- function(fun){
start_time <- Sys.time()
output <- fun
end_time <- Sys.time()
print(end_time - start_time)
return(output)
}
# Return score if the position in present, 0 otherwise
get_count_vector <- function(pos, df){
if (pos %in% df$atac_relative_pos) {
return(df$score[which(df$atac_relative_pos == pos)])
} else {return(0)}
}
# Return score vector for each position for both strands
get_count_vector_both_strands <- function(df){
plus_count_vector <- sapply(1:501, get_count_vector, df = df[df$strand == "+",])
minus_count_vector <- sapply(1:501, get_count_vector, df = df[df$strand == "-",])
return(c(plus_count_vector, minus_count_vector))
}
# Return the CAGE profiles of the (CAGE) overlapping ATAC regions
get_chr_windows_profiles <- function(cage_granges, atac_granges, chr){
# Select chromosome
cage_granges <- cage_granges[seqnames(cage_granges) == chr]
atac_granges <- atac_granges[seqnames(atac_granges) == chr]
# Add all information into one df
overlaps <- findOverlaps(cage_granges, atac_granges) # Index of overlapping CAGE fragment
length(overlaps)
# Check if there are ATAC positive windows overlapping CAGE data
if (length(overlaps) > 0){
df <- cage_granges[queryHits(overlaps)] # Keep only overlapping CAGE data
df$index_overlapping_atac <- subjectHits(overlaps) # Add index of overlapping ATAC
df %>% as_tibble() %>% # Add ATAC start site and relative position
mutate(atac_start = start(atac_granges[subjectHits(overlaps)]),
atac_relative_pos = start - atac_start + 1) -> df
# Extract profiles of each (CAGE) overlapping ATAC region
profiles <- by(data = df,
INDICES = df$index_overlapping_atac,
FUN = function(x){get_count_vector_both_strands(x)})
profiles <- data.frame(do.call(rbind, profiles))
colnames(profiles) <- c(paste("Plus_", 1:501, sep = ""), paste("Minus_", 1:501, sep = ""))
# Add metadata information
profiles <- profiles %>% mutate(# atac_index = sort(unique(subjectHits(overlaps))),
atac_start = start(atac_granges[as.numeric(rownames(profiles))]),
chr = chr) %>% relocate(c(chr, atac_start), .before = Plus_1)
return(profiles)
}
else {
print(paste(chr, "contains no overlapping windows"))
return(NULL)
}
}
# Return the windows profiles for all chromosomes
get_windows_profiles <- function(cage_granges, atac_granges){
chromosomes <- unique(seqnames(cage_granges))
list_chr_profiles <- lapply(chromosomes, function(x)
{get_chr_windows_profiles(cage_granges = cage_granges,
atac_granges = atac_granges,
chr = x)})
output_list <- list()
output_list$profiles <- data.frame(do.call(rbind, list_chr_profiles))
output_list$metadata <- output_list$profiles %>% select(chr, atac_start)
output_list$profiles <- output_list$profiles %>% select(-chr, -atac_start)
return(output_list)
}
pos_windows <- report_time_execution(get_windows_profiles(CAGE_granges,
ATAC_granges_peaks_filtered))
## [1] "chrY contains no overlapping windows"
## Time difference of 2.192336 mins
rows = nrow(pos_windows$metadata)
rows
## [1] 14954
krows <- round(rows/1000)
## Explore the resulting profiles
# Plot chr distribution
plot_chr_distribution <- function(window_metadata,
title = "Chr distribution"){
window_metadata %>% group_by(chr) %>% count() %>%
ggplot(aes(x = factor(chr,
levels = paste("chr", c(1:22, "X", "Y"), sep="")),
y = n)) +
geom_bar(stat = "identity", col = "black", fill = brewer.pal(8,"Dark2")[1]) +
labs(title = title, x = "Chromosome", y = "N. windows") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust=1,size = 12))
}
plot_chr_distribution(pos_windows$metadata, paste("Chr distribution (Positive windows ", krows, "k)", sep=""))
# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
get_cage_distribution_by_peak_position <- function(peaks_profile_df){
apply(peaks_profile_df, 2, sum) %>% as_tibble() %>%
mutate(pos = c(-250:250, -250:250), strand = c(rep("+", 501), rep("-", 501))) %>%
rename(score = value) %>% relocate(score, .after = strand) %>%
mutate(score = ifelse(strand == "-", -score, score))
}
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(pos_windows$profiles)
cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
# facet_zoom(ylim=c(-3000, 1000), shrink = FALSE) +
labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",
x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw()
# Plot distribution of ATAC-Seq peaks CAGE total coverage
get_windows_total_cage_score_distribution <- function(peaks_profile_df){
peaks_profile_df %>%
count(apply(peaks_profile_df, 1, sum)) %>%
rename(total_score = "apply(peaks_profile_df, 1, sum)", n_atac_peaks = n) %>%
relocate(total_score, .after = n_atac_peaks)
}
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(pos_windows$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>%
ggplot(aes(x = total_score, y = n_atac_peaks)) +
geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
labs(title = "Windows profiles total score distribution",
y = "N. Windows profiles",
x = "Total CAGE score",
fill = NA) +
theme_bw() +
theme(legend.position = "none")
# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>%
ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
labs(title = "Windows profiles total score distribution",
x = NA,
y = "Total CAGE score",
size = "N. Windows profiles",
col = "") +
facet_zoom(ylim=c(0, 10000), shrink = FALSE) +
scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
theme(axis.title.x=element_blank())
# Plot the maximum tss score of each window
max_tss_score_distribution <- rownames_to_column(pos_windows$profiles, var = "atac_start") %>%
mutate(max_tss_score = apply(pos_windows$profiles, 1, max)) %>%
select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()
max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) +
geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) +
facet_zoom(ylim=c(0, 60), shrink = FALSE) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
coord_cartesian(xlim = c(0, 50)) +
labs(title = "Max TSS score distribution",
y = "N. windows profiles",
x = "Max TSS score",
fill = NA) +
theme_bw() +
theme(legend.position = "none")
## Positive set filtering
# Try different filters and observe the resulting profiles
windows_profiles_filter <- function(windows,
threshold = 1, fun = max){
filter <- apply(windows$profiles, 1, fun) > threshold
windows$profiles <- windows$profile[filter,]
windows$metadata <- windows$metadata[filter,]
return(windows)
}
pos_windows_filtered_atleast_2_sum <- windows_profiles_filter(pos_windows, fun = sum)
pos_windows_filtered <- windows_profiles_filter(pos_windows)
pos_windows_filtered_atleast_3_max <- windows_profiles_filter(pos_windows, threshold = 2)
print(paste("Original windows number:", nrow(pos_windows$profiles)))
## [1] "Original windows number: 14954"
print(paste("N. windows after filtering (at least 2 reads in total):", nrow(pos_windows_filtered_atleast_2_sum$profiles)))
## [1] "N. windows after filtering (at least 2 reads in total): 9225"
print(paste("N. windows after filtering (at least a TSS with 2 reads):", nrow(pos_windows_filtered$profiles)))
## [1] "N. windows after filtering (at least a TSS with 2 reads): 6958"
print(paste("N. windows after filtering (at least a TSS with 3 reads):", nrow(pos_windows_filtered_atleast_3_max$profiles)))
## [1] "N. windows after filtering (at least a TSS with 3 reads): 3514"
rows = nrow(pos_windows_filtered$metadata)
rows
## [1] 6958
krows <- round(rows/1000)
## Plot after filtering
# Chr distribution
plot_chr_distribution(pos_windows_filtered$metadata, paste("Chr distribution (Positive windows ", krows, "k)", sep=""))
# Score distribution
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(pos_windows_filtered$profiles)
cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",
x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw()
# Plot distribution of ATAC-Seq peaks CAGE total coverage
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(pos_windows_filtered$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>%
ggplot(aes(x = total_score, y = n_atac_peaks)) +
geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
labs(title = "Windows profiles total score distribution (All chr)",
y = "N. Windows profiles",
x = "Total CAGE score",
fill = NA) +
theme_bw() +
theme(legend.position = "none")
# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>%
ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
labs(title = "Windows profiles total score distribution (All chr)",
x = NA,
y = "Total CAGE score",
size = "N. Windows profiles",
col = "") +
facet_zoom(ylim=c(0, 10000), shrink = FALSE) +
scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
theme(axis.title.x=element_blank())
# Plot the maximum tss score of each window
max_tss_score_distribution <- pos_windows_filtered$profiles %>%
mutate(atac_start = pos_windows_filtered$metadata$atac_start) %>%
mutate(max_tss_score = apply(pos_windows_filtered$profiles, 1, max)) %>%
select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()
max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) +
geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) +
facet_zoom(ylim=c(0, 60), shrink = FALSE) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
coord_cartesian(xlim = c(0, 50)) +
labs(title = "Max TSS score distribution (All chr)",
y = "N. windows profiles",
x = "Max TSS score",
fill = NA) +
theme_bw() +
theme(legend.position = "none")
# Get the distribution of CAGE score over one chr windows positions
get_score_distribution_by_pos_one_chr <- function(list_windows, chr){
windows_profile <- list_windows$profiles[list_windows$metadata == chr,]
score_distribution <- get_cage_distribution_by_peak_position(windows_profile)
score_distribution$chr = chr
return(score_distribution)
}
# Get the distribution of CAGE score for all chr
get_score_distribution_by_pos_all_chr <- function(list_windows){
list_df <- lapply(unique(list_windows$metadata$chr), function(x){
get_score_distribution_by_pos_one_chr(list_windows, as.character(x))})
return(data.frame(do.call(rbind, list_df)))
}
windows_score_distribution_by_pos_all_chr <-
get_score_distribution_by_pos_all_chr(pos_windows_filtered)
windows_score_distribution_by_pos_all_chr %>%
ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
labs(title = paste("Positive set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") +
coord_cartesian(ylim = c(-1000, 1000)) +
facet_wrap(~factor(chr,
levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw()
# Plot some profiles
plo_set_profiles <- function(windows,
chr,
details = FALSE,
title = "",
ylim = c(-50, 50),
range = 1:56, sort = FALSE){
windows_profile <- windows$profiles[windows$metadata$chr == chr,]
windows_profile %>%
mutate(total_coverage = apply(windows_profile, 1, sum),
max_coverage = apply(windows_profile, 1, max)) -> windows_profile
# Add details (minimum TSS score and total score of the window)
if (details){
row_name <- paste("w_", 1:nrow(windows_profile), "\n(T=",
as.character(total_coverage), ", \nM=",
max_coverage, ")")
} else {row_name <- paste("w_", 1:nrow(windows_profile))}
windows_profile %>%
mutate(window = row_name) %>%
relocate(c(window, total_coverage), .before = Plus_1) -> temp
if (sort){
temp %>% arrange(desc(total_coverage)) %>% slice(range) -> temp
}
# Prepare for plotting
temp <- temp %>% select(-total_coverage, -max_coverage) %>% slice(range) %>% melt()
temp$value[grepl("^M", temp$variable)] <- -temp$value[grepl("^M", temp$variable)]
temp$strand[grepl("^M", temp$variable)] <- "-"
temp$strand[grepl("^P", temp$variable)] <- "+"
temp$variable <- as.numeric(gsub("\\D", "", temp$variable)) - 251
# Plot
temp %>% ggplot(aes(x = variable, y = value, color = strand)) + geom_line() +
facet_wrap(~window, ncol = 8, strip.position = "left") +
coord_cartesian(ylim = ylim) +
labs(title = paste("Windows profiles (",
deparse(substitute(range)), ", ", chr, ")", sep = ""),
y = "TSS score",
x = "Relative position to ATAC-Seq mid peak",
fill = NA) +
theme_bw() -> plot
return(plot)
}
plo_set_profiles(pos_windows_filtered, chr = "chr1", sort=TRUE)
## Using window as id variables
plo_set_profiles(pos_windows_filtered, chr = "chr6", sort=TRUE)
## Using window as id variables
plo_set_profiles(pos_windows_filtered, chr = "chr11", sort=TRUE)
## Using window as id variables
plo_set_profiles(pos_windows_filtered, chr = "chr2", sort=TRUE)
## Using window as id variables
plo_set_profiles(pos_windows_filtered, chr = "chr3", sort=TRUE)
## Using window as id variables
plo_set_profiles(pos_windows_filtered, chr = "chr4", sort=TRUE)
## Using window as id variables
# Export original atac data (before filtering them by overlaps) and positive set ranges
positive_set_granges <- GRanges(seqnames = pos_windows_filtered$metadata$chr,
ranges = IRanges(start = pos_windows_filtered$metadata$atac_start,
width = 501))
export(positive_set_granges, "../Data/pseudo_dreg_sets/positive_set_replicate1.bed", format = "bed")
export(ATAC_granges_peaks, "../Data/atac_positive_all_granges_501.bed", format = "bed")
# Duplicate 4 times the ATAC_peaks so to use as -i to generate more ranges with shuffle
ATAC_granges_duplicated <- GRanges(rbind(as.data.frame(ATAC_granges_peaks),
as.data.frame(shift(ATAC_granges_peaks, 1)),
as.data.frame(shift(ATAC_granges_peaks, 2)),
as.data.frame(shift(ATAC_granges_peaks, -1)),
as.data.frame(shift(ATAC_granges_peaks, -2))))
export(ATAC_granges_duplicated, "../Data/atac_positive_all_granges_501_duplicatedX4.bed", format = "bed")
# Generate negative set ranges (from pseudo_dreg directory) in such a way that:
# - They will be the same size, num and strand proportion as positive set ranges, blacklist and atac positive granges will not be included, no overlaps, try to include CAGE TSS
# - Risk: forcing the overlaps with CAGE I might risk to capture some true TSS signals that are in region where ATAC-Seq was not able to capture an actual open chromatine region
bedtools shuffle -i positive_set_replicate1.bed -g ../hg38.bed -incl ../ctss_files/timepoint_0/hg38.CAGE_AF7N_Pool_81_A2.filt.ctss.bed.gz -excl hg38-blacklist.v2.bed -excl ../atac_positive_all_granges_501.bed -noOverlapping > negative_set_replicate1_a1.bed
# - They will be the same size, num and strand proportion as atac positive granges, blacklist and atac positive granges will not be included, allow overlaps, don't force CAGE TSS
# - I will need to reiterate several times before I reach the desired number of profiles with at least a TSS
bedtools shuffle -i ../atac_positive_all_granges_501.bed -g ../hg38.bed -excl hg38-blacklist.v2.bed -excl ../atac_positive_all_granges_501.bed > negative_set_replicate1_b1.bed
# Do the same but without creating overlaps in the generated ranges
bedtools shuffle -i ../atac_positive_all_granges_501_duplicatedX4.bed -g ../hg38.bed -excl hg38-blacklist.v2.bed -excl ../atac_positive_all_granges_501.bed -noOverlapping > negative_set_replicate1_duplicateX4_noOverlaps.bed
Method A: I start and end 7k ranges that overlap atleast with 1 CAGE (forced by bedtools shuffle) Method B: Starting with 258k ranges I obtain 8k that overlap atleast with 1 CAGE (random shuffle ATAC negative regions) Starting with 650k, obtained 22k
# Import negative set ranges and check for overlaps
negative_set_granges <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_a1.bed", format = "bed")
findOverlaps(negative_set_granges, positive_set_granges)
## Hits object with 0 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## -------
## queryLength: 6958 / subjectLength: 6958
# Extract CAGE profiles
neg_windows <- report_time_execution(get_windows_profiles(CAGE_granges,
negative_set_granges))
## Time difference of 1.082768 mins
neg_windows$metadata
rows = nrow(neg_windows$metadata)
rows
## [1] 6958
krows <- round(rows/1000)
## Explore the resulting profiles
# Chr distribution
plot_chr_distribution(neg_windows$metadata, paste("Chr distribution (Negative windows ", krows, "k)", sep=""))
# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(neg_windows$profiles)
cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
# facet_zoom(ylim=c(-3000, 1000), shrink = FALSE) +
labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",
x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw()
# Plot distribution of ATAC-Seq peaks CAGE total coverage
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(neg_windows$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>%
ggplot(aes(x = total_score, y = n_atac_peaks)) +
geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
labs(title = "Windows profiles total score distribution",
y = "N. Windows profiles",
x = "Total CAGE score",
fill = NA) +
theme_bw() +
theme(legend.position = "none")
# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>%
ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
labs(title = "Windows profiles total score distribution",
x = NA,
y = "Total CAGE score",
size = "N. Windows profiles",
col = "") +
facet_zoom(ylim=c(0, 10000), shrink = FALSE) +
scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
theme(axis.title.x=element_blank())
# Plot the maximum tss score of each window
max_tss_score_distribution <- neg_windows$profiles %>%
mutate(atac_start = neg_windows$metadata$atac_start) %>%
mutate(max_tss_score = apply(neg_windows$profiles, 1, max)) %>%
select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()
max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) +
geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) +
#facet_zoom(ylim=c(0, 60), shrink = FALSE) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
coord_cartesian(xlim = c(0, 50)) +
labs(title = "Max TSS score distribution",
y = "N. windows profiles",
x = "Max TSS score",
fill = NA) +
theme_bw() +
theme(legend.position = "none")
# Plot different chromosomes score distribution
windows_score_distribution_by_pos_all_chr <-
get_score_distribution_by_pos_all_chr(neg_windows)
windows_score_distribution_by_pos_all_chr %>%
ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") +
coord_cartesian(ylim = c(-1000, 1000)) +
facet_wrap(~factor(chr,
levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw()
# Plot some profiles
plo_set_profiles(neg_windows, chr = "chr1", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr6", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr11", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr2", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr3", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr4", sort=TRUE)
## Using window as id variables
# Import negative set ranges and check for overlaps
negative_set_granges <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_b1.bed", format = "bed")
findOverlaps(negative_set_granges, positive_set_granges)
## Hits object with 0 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## -------
## queryLength: 258265 / subjectLength: 6958
sum(countOverlaps(negative_set_granges) > 1)
## [1] 21357
# Extract CAGE profiles
neg_windows <- report_time_execution(get_windows_profiles(CAGE_granges,
negative_set_granges))
## Time difference of 1.46191 mins
rows = nrow(neg_windows$metadata)
rows
## [1] 8123
krows <- round(rows/1000)
## Explore the resulting profiles
# Chr distribution
plot_chr_distribution(neg_windows$metadata, paste("Chr distribution (Negative windows, ", krows, "k)", sep=""))
# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(neg_windows$profiles)
cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
# facet_zoom(ylim=c(-3000, 1000), shrink = FALSE) +
labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",
x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw()
# Plot distribution of ATAC-Seq peaks CAGE total coverage
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(neg_windows$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>%
ggplot(aes(x = total_score, y = n_atac_peaks)) +
geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
labs(title = "Windows profiles total score distribution",
y = "N. Windows profiles",
x = "Total CAGE score",
fill = NA) +
theme_bw() +
theme(legend.position = "none")
# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>%
ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
labs(title = "Windows profiles total score distribution",
x = NA,
y = "Total CAGE score",
size = "N. Windows profiles",
col = "") +
facet_zoom(ylim=c(0, 10000), shrink = FALSE) +
scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
theme(axis.title.x=element_blank())
# Plot the maximum tss score of each window
max_tss_score_distribution <- neg_windows$profiles %>%
mutate(atac_start = neg_windows$metadata$atac_start) %>%
mutate(max_tss_score = apply(neg_windows$profiles, 1, max)) %>%
select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()
max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) +
geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) +
#facet_zoom(ylim=c(0, 60), shrink = FALSE) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
coord_cartesian(xlim = c(0, 50)) +
labs(title = "Max TSS score distribution",
y = "N. windows profiles",
x = "Max TSS score",
fill = NA) +
theme_bw() +
theme(legend.position = "none")
# Plot different chromosomes score distribution
windows_score_distribution_by_pos_all_chr <-
get_score_distribution_by_pos_all_chr(neg_windows)
windows_score_distribution_by_pos_all_chr %>%
ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position (", krows, "k)", sep=""),
x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") +
coord_cartesian(ylim = c(-1000, 1000)) +
facet_wrap(~factor(chr,
levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw()
# Plot some profiles
plo_set_profiles(neg_windows, chr = "chr1", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr6", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr11", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr2", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr3", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr4", sort=TRUE)
## Using window as id variables
# Import negative set ranges and check for overlaps
#negative_set_granges_1 <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_b1.bed", format = "bed")
#negative_set_granges_2 <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_b2.bed", format = "bed")
#negative_set_granges_3 <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_b3.bed", format = "bed")
#negative_set_granges_4 <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_b4.bed", format = "bed")
#negative_set_granges_5 <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_b5.bed", format = "bed")
#negative_set_granges <- GenomicRanges::union(negative_set_granges_1, negative_set_granges_2)
#negative_set_granges <- GenomicRanges::union(negative_set_granges, negative_set_granges_3)
#negative_set_granges <- GenomicRanges::union(negative_set_granges, negative_set_granges_4)
#negative_set_granges <- GenomicRanges::union(negative_set_granges, negative_set_granges_5)
# Import negative set ranges and check for overlaps
negative_set_granges <- import("../Data/pseudo_dreg_sets/negative_set_replicate1_duplicateX4_noOverlaps.bed", format = "bed")
# Check for overlaps with ATAC positive regions and between granges in the same file
findOverlaps(negative_set_granges, positive_set_granges)
## Hits object with 0 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## -------
## queryLength: 1291325 / subjectLength: 6958
paste("Ranges before extraction:", length(negative_set_granges))
## [1] "Ranges before extraction: 1291325"
paste("Within overlaps:", sum(countOverlaps(negative_set_granges) > 1))
## [1] "Within overlaps: 0"
# Extract CAGE profiles
neg_windows <- report_time_execution(get_windows_profiles(CAGE_granges,
negative_set_granges))
## Time difference of 5.977751 mins
# Check for profiles without overlapping CAGE data
non_overlapping_ranges <- apply(neg_windows$profiles, 1, sum) == 0
if (sum(non_overlapping_ranges) > 0){
print(paste("Removing", sum(non_overlapping_ranges), "non-overlapping ranges"))
neg_windows$profiles <- neg_windows$profiles[!non_overlapping_ranges,]
neg_windows$metadata <- neg_windows$metadata[!non_overlapping_ranges,]
}
rows = nrow(neg_windows$metadata)
paste("Extracted profiles:", rows)
## [1] "Extracted profiles: 40847"
krows <- round(rows/1000)
## Explore the resulting profiles
# Chr distribution
plot_chr_distribution(neg_windows$metadata, paste("Chr distribution (Negative windows ", krows, "k)", sep=""))
# Plot the distribution of CAGE coverage for ATAC-Seq peak relative positions
cage_distribution_by_peak_position <- get_cage_distribution_by_peak_position(neg_windows$profiles)
cage_distribution_by_peak_position %>% ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
# facet_zoom(ylim=c(-3000, 1000), shrink = FALSE) +
labs(title = "CAGE score distribution over ATAC-Seq peaks relative position (All chr)",
x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) + theme_bw()
# Plot distribution of ATAC-Seq peaks CAGE total coverage
windows_total_cage_score_distribution <- get_windows_total_cage_score_distribution(neg_windows$profiles)
windows_total_cage_score_distribution
windows_total_cage_score_distribution %>% arrange(desc(total_score))
# Focus on ATAC-Seq peaks number
windows_total_cage_score_distribution %>% filter(total_score < 50) %>%
ggplot(aes(x = total_score, y = n_atac_peaks)) +
geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[5]) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
labs(title = "Windows profiles total score distribution",
y = "N. Windows profiles",
x = "Total CAGE score",
fill = NA) +
theme_bw() +
theme(legend.position = "none")
# Focus on CAGE total coverage per peak
windows_total_cage_score_distribution %>% mutate(type = "Windows profiles") %>%
ggplot(aes(x = type, y = total_score)) + geom_violin(fill= brewer.pal(8,"Greys")[3]) +
geom_jitter(aes(size = n_atac_peaks, col = n_atac_peaks), alpha=0.5) + theme_bw() +
labs(title = "Windows profiles total score distribution",
x = NA,
y = "Total CAGE score",
size = "N. Windows profiles",
col = "") +
facet_zoom(ylim=c(0, 1000), shrink = FALSE) +
scale_colour_gradientn(colours = c("blue", "red"), values = c(0, 0.5, 1)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
theme(axis.title.x=element_blank())
# Plot the maximum tss score of each window
max_tss_score_distribution <- neg_windows$profiles %>%
mutate(atac_start = neg_windows$metadata$atac_start) %>%
mutate(max_tss_score = apply(neg_windows$profiles, 1, max)) %>%
select(atac_start, max_tss_score) %>% group_by(max_tss_score) %>% count()
max_tss_score_distribution
max_tss_score_distribution %>% ggplot(aes(x = max_tss_score, y = n, fill = "red")) +
geom_bar(stat = "identity", color = "black", fill = brewer.pal(8,"Dark2")[3]) +
#facet_zoom(ylim=c(0, 60), shrink = FALSE) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 15)) +
coord_cartesian(xlim = c(0, 50)) +
labs(title = "Max TSS score distribution",
y = "N. windows profiles",
x = "Max TSS score",
fill = NA) +
theme_bw() +
theme(legend.position = "none")
# Plot different chromosomes score distribution
windows_score_distribution_by_pos_all_chr <-
get_score_distribution_by_pos_all_chr(neg_windows)
windows_score_distribution_by_pos_all_chr %>%
ggplot(aes(x = pos, y = score, color = strand)) + geom_line() +
labs(title = paste("Negative set CAGE score distribution over ATAC-Seq peaks relative position", krows, "k)", sep=""),
x = "Relative position to ATAC mid peaks", y = "Sum of scores over windows") +
coord_cartesian(ylim = c(-1000, 1000)) +
facet_wrap(~factor(chr,
levels = paste("chr", c(1:22, "X", "Y"), sep=""))) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5)) + theme_bw()
# Plot some profiles
plo_set_profiles(neg_windows, chr = "chr1", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr6", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr11", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr2", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr3", sort=TRUE)
## Using window as id variables
plo_set_profiles(neg_windows, chr = "chr4", sort=TRUE)
## Using window as id variables
# Sample from negative set
windows_sampling <- function(windows, size){
uniform_sampling <- runif(size, 1, nrow(windows$metadata))
windows$metadata <- windows$metadata[uniform_sampling,]
windows$profiles <- windows$profiles[uniform_sampling,]
return(windows)
}
# Plot chr distribution
plot_chr_distribution(pos_windows_filtered$metadata, paste("Chr distribution (Positive windows, ",
round(nrow(pos_windows_filtered$metadata)/1000), "k)", sep = ""))
plot_chr_distribution(neg_windows$metadata, paste("Chr distribution (Negative windows, ",
round(nrow(neg_windows$metadata)/1000), "k)", sep = ""))
neg_windows_sampled <- windows_sampling(neg_windows, size=nrow(pos_windows_filtered$metadata))
plot_chr_distribution(neg_windows_sampled$metadata, paste("Chr distribution (Negative windows sampled, ",
round(nrow(neg_windows_sampled$metadata)/1000), "k)", sep = ""))
## Export data for ML feeding
# Merge positive and negative examples
windows_profile <- rbind(mutate(pos_windows_filtered$profiles, label = 1),
mutate(neg_windows_sampled$profiles, label = 0))
windows_metadata <- rbind(mutate(pos_windows_filtered$metadata, label = 1),
mutate(neg_windows_sampled$metadata, label = 0))
# Shuffle the data
index <- sample(nrow(windows_profile))
windows_profile <- windows_profile[index,]
windows_metadata <- windows_metadata[index,]
paste("Size windows profile", nrow(windows_profile))
## [1] "Size windows profile 13916"
paste("Size positive windows profile (filtered)", nrow(pos_windows_filtered$profile))
## [1] "Size positive windows profile (filtered) 6958"
paste("Size negative windows profile (sampled)", nrow(neg_windows_sampled$profile))
## [1] "Size negative windows profile (sampled) 6958"
# Divide train and test by chr
test_index <- windows_metadata$chr %in% c("chr2", "chr3", "chr4")
windows_profile_test <- windows_profile[test_index,]
windows_metadata_test <- windows_metadata[test_index,]
windows_profile_train <- windows_profile[!test_index,]
windows_metadata_train <- windows_metadata[!test_index,]
# Export
write_csv(windows_profile, "../Data/ML_input/profiles_replicate1_b2.csv")
write_csv(windows_metadata, "../Data/ML_input/metadata_replicate1_b2.csv")
write_csv(windows_profile_test, "../Data/ML_input/profiles_replicate1_b2_test.csv")
write_csv(windows_metadata_test, "../Data/ML_input/metadata_replicate1_b2_test.csv")
write_csv(windows_profile_train, "../Data/ML_input/profiles_replicate1_b2_train.csv")
write_csv(windows_metadata_train, "../Data/ML_input/metadata_replicate1_b2_train.csv")